1. Data

Information about the test:

question description MATH_group
B1 properties of fractions A
B2 composition of functions B
B3 completing the square A
B4 trig double angle formula A
B5 trig wave function A
B6 graphical vector sum B
B7 angle between 3d vectors (in context) B
B8 simplify logs A
B9 identify graph of rational functions B
B10 2x2 system possibilities B
B11 using max and min functions C
B12 find point with given slope A
B13 equation of tangent A
B14 find minimum gradient of cubic B
B15 find and classify stationary points of cubic A
B16 trig chain rule A
B17 chain rule A
B18 definite integral A
B19 area between curve and x-axis (in 2 parts) B
B20 product rule with given values B

Load the student scores for the test:

Show data summary

test_scores %>% skim()
Data summary
Name Piped data
Number of rows 5500
Number of columns 23
_______________________
Column type frequency:
character 2
numeric 21
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
year 0 1 7 7 0 5 0
AnonID 0 1 9 9 0 5435 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Total 0 1.00 70.82 20.47 0 59.69 75.0 86.00 100 ▁▁▃▇▇
B1 39 0.99 4.46 1.45 0 5.00 5.0 5.00 5 ▁▁▁▁▇
B2 211 0.96 3.89 2.08 0 5.00 5.0 5.00 5 ▂▁▁▁▇
B3 132 0.98 4.21 1.30 0 3.75 5.0 5.00 5 ▁▁▁▁▇
B4 375 0.93 4.36 1.67 0 5.00 5.0 5.00 5 ▁▁▁▁▇
B5 1377 0.75 2.35 1.82 0 2.00 2.0 5.00 5 ▅▇▁▁▅
B6 716 0.87 4.39 1.54 0 5.00 5.0 5.00 5 ▁▁▁▁▇
B7 656 0.88 2.26 2.49 0 0.00 0.0 5.00 5 ▇▁▁▁▆
B8 199 0.96 4.50 1.50 0 5.00 5.0 5.00 5 ▁▁▁▁▇
B9 244 0.96 3.93 1.71 0 2.50 5.0 5.00 5 ▁▁▂▁▇
B10 1022 0.81 2.77 1.73 0 1.25 2.5 4.38 5 ▃▇▂▅▇
B11 515 0.91 4.18 1.41 0 3.00 5.0 5.00 5 ▁▂▁▁▇
B12 371 0.93 4.48 1.42 0 5.00 5.0 5.00 5 ▁▁▁▁▇
B13 304 0.94 4.06 1.68 0 3.00 5.0 5.00 5 ▁▂▁▁▇
B14 576 0.90 3.30 1.78 0 2.00 4.0 5.00 5 ▂▆▁▁▇
B15 352 0.94 3.78 1.91 0 2.50 5.0 5.00 5 ▂▁▂▁▇
B16 270 0.95 4.67 1.24 0 5.00 5.0 5.00 5 ▁▁▁▁▇
B17 200 0.96 4.78 1.02 0 5.00 5.0 5.00 5 ▁▁▁▁▇
B18 367 0.93 4.07 1.94 0 5.00 5.0 5.00 5 ▂▁▁▁▇
B19 1044 0.81 3.83 2.11 0 5.00 5.0 5.00 5 ▂▁▁▁▇
B20 920 0.83 2.76 2.49 0 0.00 5.0 5.00 5 ▆▁▁▁▇

Data cleaning

  1. For students who took the test more than once, consider the attempt with the highest scores only and remove the others;

  2. Eliminate the students who scored three or more zeros in the 5 easiest questions in the second-half of the test; and

  3. Add the students scoring 30 or more marks in total back to the sample.

# Data cleaning:
#   1. keep only each student's best attempt (highest Total);
#   2. drop students missing 3+ of the 5 easiest second-half questions
#      (B11, B12, B16, B17, B18), unless
#   3. their Total is at least 30 (NOTE(review): the prose says "more than
#      30" but the code uses >=, which also keeps exactly 30 -- confirm).
test_scores <- test_scores_unfiltered %>% 
  group_by(AnonID) %>% 
  # with_ties = FALSE guarantees one row per student even when two attempts
  # tie on Total (plain slice_max keeps all tied rows)
  slice_max(Total, n = 1, with_ties = FALSE) %>% 
  ungroup() %>% 
  # vectorised count of missing answers among the 5 easiest questions;
  # no rowwise() needed since is.na() is already elementwise per column
  mutate(invalid_in_easiest_5 =
           is.na(B11) + is.na(B12) + is.na(B16) + is.na(B17) + is.na(B18)) %>% 
  filter(invalid_in_easiest_5 <= 2 | Total >= 30) %>% 
  select(-invalid_in_easiest_5)

# test_scores <- test_scores_unfiltered %>%
#   group_by(AnonID) %>%
#   slice_max(Total, n = 1) %>%
#   ungroup() %>%
#   filter(Total > 0)

# Compare the Total-score distribution before and after filtering,
# side by side in one faceted histogram.
histogram_data <- bind_rows(
  unfiltered = test_scores_unfiltered %>% select(Total),
  filtered = test_scores %>% select(Total),
  .id = "dataset"
)

histogram_data %>%
  mutate(dataset = fct_relevel(dataset, "unfiltered", "filtered")) %>%
  ggplot(aes(x = Total)) +
  geom_histogram(bins = 25) +
  facet_wrap(vars(dataset)) +
  theme_minimal()

# Tabulate how many of a student's item scores are missing (NA),
# with counts and percentages per missingness level
test_scores %>% 
  mutate(num_na = rowSums(is.na(.))) %>% 
  janitor::tabyl(num_na)
##  num_na    n      percent
##       0 2917 0.5497549943
##       1  790 0.1488880513
##       2  512 0.0964945345
##       3  344 0.0648322654
##       4  234 0.0441010177
##       5  156 0.0294006785
##       6  106 0.0199773841
##       7   84 0.0158311346
##       8   61 0.0114964191
##       9   44 0.0082924991
##      10   29 0.0054655107
##      11   14 0.0026385224
##      12    7 0.0013192612
##      13    4 0.0007538636
##      14    2 0.0003769318
##      15    2 0.0003769318
# Overall missingness: total NA count, and the NA rate across the 20 items
test_scores %>% 
  mutate(num_na = rowSums(is.na(.))) %>% 
  summarise(total_na = sum(num_na), na_rate = total_na / (n() * 20))
## # A tibble: 1 x 2
##   total_na na_rate
##      <dbl>   <dbl>
## 1     7308  0.0689
test_scores <- test_scores %>% mutate(across(starts_with('B') & where(is.numeric), ~replace_na(.,0)))

Data summary

The number of responses from each class:

# Number of responses per academic year, shaded by count
test_scores %>% 
  group_by(year) %>% 
  tally() %>% 
  gt() %>% 
  data_color(
    columns = c("n"),
    # NOTE(review): `colors` is deprecated in recent gt releases in favour of
    # `fn`/`palette`; works here but worth updating when gt is upgraded
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  )
year n
2017-18 885
2018-19 1118
2019-20 1190
2020-21 1152
2021-22 961

Mean and standard deviation for each item:

# Per-item completeness, mean and SD for each academic year, laid out as a
# wide gt table with one spanner (column group) per year.
test_scores %>% 
  select(-AnonID, -Total) %>% 
  group_by(year) %>% 
  skim_without_charts() %>% 
  # drop the character-column stats, the percentile columns, and the type label
  select(-contains("character."), -contains("numeric.p"), -skim_type) %>% 
  rename(complete = complete_rate) %>% 
  # make the table wider, i.e. with separate columns for each year's results, with the year at the start of each column name
  pivot_wider(names_from = year, values_from = -c(skim_variable, year), names_glue = "{year}__{.value}") %>% 
  # put the columns in order by year
  select(sort(names(.))) %>% 
  select(skim_variable, everything()) %>% 
  # use GT to make the table look nice
  gt(rowname_col = "skim_variable") %>% 
  # group the columns from each year (splits names on the "__" delimiter)
  tab_spanner_delim(delim = "__") %>%
  fmt_number(columns = contains("numeric"), decimals = 2) %>%
  fmt_percent(columns = contains("complete"), decimals = 0) %>% 
  # change all the numeric.mean and numeric.sd column names to Mean and SD
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.mean"), label = "Mean") %>% deframe()
  ) %>% 
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.sd"), label = "SD") %>% deframe()
  ) %>%
  data_color(
    columns = contains("numeric.mean"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  )
2017-18 2018-19 2019-20 2020-21 2021-22
complete n_missing Mean SD complete n_missing Mean SD complete n_missing Mean SD complete n_missing Mean SD complete n_missing Mean SD
B1 100% 0 4.48 1.45 100% 0 4.40 1.50 100% 0 4.54 1.36 100% 0 4.50 1.40 100% 0 4.43 1.52
B2 100% 0 3.67 2.21 100% 0 3.73 2.18 100% 0 3.92 2.06 100% 0 3.85 2.10 100% 0 3.93 2.05
B3 100% 0 4.14 1.36 100% 0 4.11 1.37 100% 0 4.16 1.38 100% 0 4.21 1.36 100% 0 4.30 1.29
B4 100% 0 4.04 1.97 100% 0 4.11 1.92 100% 0 4.17 1.86 100% 0 4.24 1.79 100% 0 4.17 1.86
B5 100% 0 1.79 1.83 100% 0 1.69 1.84 100% 0 1.81 1.87 100% 0 1.86 1.91 100% 0 1.91 1.96
B6 100% 0 3.90 2.01 100% 0 3.88 2.01 100% 0 4.01 1.94 100% 0 3.97 1.96 100% 0 3.81 2.07
B7 100% 0 1.95 2.44 100% 0 1.95 2.44 100% 0 2.06 2.46 100% 0 2.17 2.48 100% 0 2.09 2.47
B8 100% 0 4.34 1.69 100% 0 4.46 1.55 100% 0 4.47 1.53 100% 0 4.46 1.56 100% 0 4.45 1.56
B9 100% 0 3.75 1.78 100% 0 3.71 1.85 100% 0 3.85 1.80 100% 0 3.98 1.73 100% 0 4.01 1.69
B10 100% 0 2.06 1.83 100% 0 2.12 1.84 100% 0 2.41 1.88 100% 0 2.52 1.91 100% 0 2.45 1.92
B11 100% 0 3.86 1.74 100% 0 3.86 1.73 100% 0 3.96 1.68 100% 0 3.90 1.74 100% 0 3.87 1.76
B12 100% 0 4.27 1.67 100% 0 4.24 1.70 100% 0 4.35 1.61 100% 0 4.31 1.65 100% 0 4.33 1.63
B13 100% 0 3.82 1.84 100% 0 3.91 1.81 100% 0 4.00 1.76 100% 0 4.02 1.75 100% 0 3.96 1.79
B14 100% 0 2.92 1.91 100% 0 2.95 1.90 100% 0 3.11 1.91 100% 0 3.07 1.94 100% 0 3.14 1.97
B15 100% 0 3.64 2.00 100% 0 3.58 2.01 100% 0 3.53 2.06 100% 0 3.79 1.93 100% 0 3.68 2.02
B16 100% 0 4.56 1.41 100% 0 4.51 1.49 100% 0 4.53 1.46 100% 0 4.54 1.44 100% 0 4.72 1.14
B17 100% 0 4.76 1.06 100% 0 4.69 1.20 100% 0 4.74 1.11 100% 0 4.71 1.16 100% 0 4.79 1.00
B18 100% 0 3.83 2.12 100% 0 3.75 2.16 100% 0 3.96 2.03 100% 0 3.95 2.03 100% 0 4.08 1.94
B19 100% 0 2.97 2.46 100% 0 3.14 2.42 100% 0 3.24 2.39 100% 0 3.30 2.37 100% 0 3.36 2.35
B20 100% 0 2.28 2.49 100% 0 2.35 2.50 100% 0 2.42 2.50 100% 0 2.28 2.49 100% 0 2.51 2.50

2. Testing assumptions

Before applying IRT, we should check that the data satisfies the assumptions needed by the model. In particular, to use a 1-dimensional IRT model, we should have some evidence of unidimensionality in the test scores.

Inter-item correlations

If the test is unidimensional then we would expect student scores on pairs of items to be correlated.

This plot shows the correlations between scores on each pair of items:

# Keep only the 20 item-score columns (B1-B20).
# NOTE(review): -AnonID is redundant here since AnonID does not start with "B",
# but it is harmless.
item_scores <- test_scores %>% 
  select(starts_with("B"), -AnonID)

# Pairwise item correlations with bootstrapped confidence intervals
cor_ci <- psych::corCi(item_scores, plot = FALSE)

# Correlation plot showing the upper/lower CI bounds in the two triangles
psych::cor.plot.upperLowerCi(cor_ci)

Checking for correlations that are not significantly different from 0, there are none:

# List any item pairs whose correlation is not significantly different from 0
# (drop the ".e" standard-error columns for display)
cor_ci$ci %>% 
  as_tibble(rownames = "corr") %>% 
  filter(p > 0.05) %>% 
  arrange(-p) %>% 
  select(-contains(".e")) %>% 
  gt() %>% 
  fmt_number(columns = 2:4, decimals = 3)
corr lower upper p

The overall picture is that the item scores are well correlated with each other.

Dimensionality

# Check suitability for factor analysis (KMO, Bartlett's sphericity) and
# estimate the number of factors via the method-agreement procedure
structure <- check_factorstructure(item_scores)
n <- n_factors(item_scores)

Is the data suitable for Factor Analysis?

  • KMO: The Kaiser, Meyer, Olkin (KMO) measure of sampling adequacy suggests that data seems appropriate for factor analysis (KMO = 0.92).
  • Sphericity: Bartlett’s test of sphericity suggests that there is sufficient significant correlation in the data for factor analysis (Chisq(190) = 15907.67, p < .001).

Method Agreement Procedure:

The choice of 1 dimensions is supported by 8 (34.78%) methods out of 23 (t, p, Acceleration factor, R2, VSS complexity 1, Velicer’s MAP, TLI, RMSEA).

plot(n)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

summary(n) %>% gt()
n_Factors n_Methods
1 8
2 5
3 1
4 3
5 1
14 1
15 1
19 3
#n %>% tibble() %>% gt()
# Parallel analysis as an additional check on dimensionality
fa.parallel(item_scores, fa = "fa")

## Parallel analysis suggests that the number of factors =  6  and the number of components =  NA

1 Factor

We use the factanal function to fit a 1-factor model.

Note that this function cannot handle missing data, so any NA scores must be set to 0 for this analysis.

# Fit a single-factor model (varimax rotation is a no-op with one factor)
fitfact <- factanal(item_scores,
                    factors = 1,
                    rotation = "varimax")
# Suppress loadings below 0.3 and sort items by loading in the printout
print(fitfact, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores, factors = 1, rotation = "varimax")
## 
## Uniquenesses:
##   B1   B2   B3   B4   B5   B6   B7   B8   B9  B10  B11  B12  B13  B14  B15  B16 
## 0.91 0.79 0.72 0.85 0.87 0.77 0.85 0.84 0.82 0.63 0.77 0.84 0.78 0.61 0.92 0.90 
##  B17  B18  B19  B20 
## 0.90 0.88 0.70 0.69 
## 
## Loadings:
##  [1] 0.53 0.61 0.62 0.54 0.56 0.30 0.46 0.39 0.36 0.48 0.39 0.40 0.42 0.48 0.41
## [16] 0.47      0.32 0.31 0.35
## 
##                Factor1
## SS loadings       3.97
## Proportion Var    0.20
## 
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 1490.87 on 170 degrees of freedom.
## The p-value is 1.19e-209
# Tidy table of 1-factor loadings.
# NOTE(review): `load` shadows base::load() in this session; a more
# distinctive name (e.g. load1) would be safer.
load <- tidy(fitfact)

# Plot each item's loading on the single factor, coloured by MATH group
load %>% 
  select(question = variable, factor_loading = fl1) %>% 
  left_join(item_info, by = "question") %>% 
  ggplot(aes(x = factor_loading, y = 0, colour = MATH_group)) + 
    geom_point() + 
    geom_label_repel(aes(label = question), show.legend = FALSE) +
    scale_colour_manual(values = MATH_colours) +
  scale_y_discrete() +
    labs(x = "Factor 1", y = NULL,
         title = "Standardised Loadings", 
         subtitle = "Based upon correlation matrix") +
    theme_minimal()

# Tabulate the 1-factor loadings in descending order,
# shading the loading column and the MATH group column
load %>% 
  select(question = variable, factor_loading = fl1) %>% 
  left_join(item_info, by = "question") %>% 
  arrange(-factor_loading) %>% 
  gt() %>%
  data_color(
    columns = contains("factor"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("MATH"),
    colors = MATH_colours
  )
question factor_loading description MATH_group
B14 0.6215092 find minimum gradient of cubic B
B10 0.6105804 2x2 system possibilities B
B20 0.5585814 product rule with given values B
B19 0.5439747 area between curve and x-axis (in 2 parts) B
B3 0.5282770 completing the square A
B11 0.4839993 using max and min functions C
B6 0.4837187 graphical vector sum B
B13 0.4730842 equation of tangent A
B2 0.4622684 composition of functions B
B9 0.4224717 identify graph of rational functions B
B12 0.4055023 find point with given slope A
B8 0.3954844 simplify logs A
B4 0.3895603 trig double angle formula A
B7 0.3893010 angle between 3d vectors (in context) B
B5 0.3623280 trig wave function A
B18 0.3507785 definite integral A
B16 0.3150715 trig chain rule A
B17 0.3116017 chain rule A
B1 0.3003783 properties of fractions A
B15 0.2776236 find and classify stationary points of cubic A

It is striking here that the MATH Group B questions are those that load most strongly onto this factor.

2 Factor

Here we also investigate the 2-factor solution, to see whether these factors are interpretable.

# Fit and print the 2-factor solution (varimax keeps the factors orthogonal)
fitfact2 <- factanal(item_scores, factors = 2, rotation = "varimax")
print(fitfact2, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores, factors = 2, rotation = "varimax")
## 
## Uniquenesses:
##   B1   B2   B3   B4   B5   B6   B7   B8   B9  B10  B11  B12  B13  B14  B15  B16 
## 0.91 0.78 0.72 0.85 0.87 0.75 0.84 0.84 0.82 0.59 0.76 0.84 0.78 0.61 0.91 0.71 
##  B17  B18  B19  B20 
## 0.64 0.84 0.70 0.69 
## 
## Loadings:
##     Factor1 Factor2
## B10 0.63           
## B14 0.58           
## B20 0.52           
## B16         0.52   
## B17         0.60   
## B1                 
## B2  0.45           
## B3  0.49           
## B4  0.35           
## B5  0.35           
## B6  0.49           
## B7  0.40           
## B8  0.33           
## B9  0.38           
## B11 0.46           
## B12 0.36           
## B13 0.43           
## B15                
## B18         0.33   
## B19 0.48           
## 
##                Factor1 Factor2
## SS loadings       3.29    1.26
## Proportion Var    0.16    0.06
## Cumulative Var    0.16    0.23
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 821.93 on 151 degrees of freedom.
## The p-value is 7.56e-93
# Tidy table of 2-factor loadings
load2 <- tidy(fitfact2)

# Scatter the items in the (Factor 1, Factor 2) loading plane,
# coloured and shaped by MATH group
load2_plot <- load2 %>%
  rename(question = variable) %>% 
  left_join(item_info, by = "question") %>%
  ggplot(aes(x = fl1, y = fl2, colour = MATH_group, shape = MATH_group)) +
  geom_point() +
  geom_text_repel(aes(label = question), show.legend = FALSE, alpha = 0.6) +
  labs(
    x = "Factor 1 (of 2)",
    y = "Factor 2 (of 2)"
  ) +
  scale_colour_manual("MATH group", values = MATH_colours) +
  scale_shape_manual(name = "MATH group", values = c(19, 17, 18)) +
  theme_minimal()

# On-screen version carries a title...
load2_plot +
  labs(
    title = "Standardised Loadings",
    subtitle = "Showing the 2-factor model"
  )

# ...while the untitled copy is saved for the paper
ggsave("output/uoe_post_2factor.pdf", units = "cm", width = 12, height = 8, dpi = 300,
       plot = load2_plot)
# For each item, identify the factor it loads most strongly on
# (by absolute loading), giving a (main_factor, variable) lookup table.
main_factors <- load2 %>% 
#  mutate(factorNone = 0.4) %>%  # add this to set the main factor to "None" where all loadings are below 0.4
  pivot_longer(names_to = "factor",
               cols = contains("fl")) %>% 
  mutate(value_abs = abs(value)) %>% 
  group_by(variable) %>% 
  # keep the factor with the largest absolute loading;
  # slice_max() supersedes top_n() and, like it, retains all tied rows
  slice_max(value_abs, n = 1) %>% 
  ungroup() %>% 
  transmute(main_factor = factor, variable)


# Present the 2-factor loadings grouped by each item's main factor,
# with loadings above |0.4| emphasised in bold (LaTeX-ready via kableExtra)
load2 %>% 
  select(-uniqueness) %>% 
  # add the info about which is the main factor
  left_join(main_factors, by = "variable") %>%
  left_join(item_info %>% select(variable = question, description, MATH_group), by = "variable") %>% 
  arrange(main_factor) %>% 
  select(main_factor, everything()) %>% 
  # arrange items by descending loading on their main factor
  rowwise() %>% 
  mutate(max_loading = max(abs(c_across(starts_with("fl"))))) %>% 
  group_by(main_factor) %>% 
  arrange(-max_loading, .by_group = TRUE) %>% 
  select(-max_loading) %>% 
  # sort out the presentation
  rename("Main Factor" = main_factor,
         "Question" = variable) %>%
  # NOTE(review): mutate_at()/vars() are superseded by across(), and T/F
  # should be TRUE/FALSE -- behaviour is unchanged but worth modernising
  mutate_at(
    vars(starts_with("fl")),
    ~ cell_spec(round(., digits = 3), bold = if_else(abs(.) > 0.4, T, F))
  ) %>% 
  kable(booktabs = T, escape = F, longtable = T) %>% 
  kableExtra::collapse_rows(columns = 1, valign = "top") %>%
  kableExtra::kable_styling(latex_options = c("repeat_header"))
Main Factor Question fl1 fl2 description MATH_group
fl1 B10 0.628 0.114 2x2 system possibilities B
B14 0.583 0.217 find minimum gradient of cubic B
B20 0.523 0.203 product rule with given values B
B3 0.493 0.19 completing the square A
B6 0.488 0.101 graphical vector sum B
B19 0.478 0.26 area between curve and x-axis (in 2 parts) B
B11 0.465 0.149 using max and min functions C
B2 0.455 0.12 composition of functions B
B13 0.43 0.19 equation of tangent A
B7 0.398 0.07 angle between 3d vectors (in context) B
B9 0.38 0.182 identify graph of rational functions B
B12 0.357 0.189 find point with given slope A
B4 0.35 0.168 trig double angle formula A
B5 0.345 0.114 trig wave function A
B8 0.329 0.227 simplify logs A
B1 0.27 0.131 properties of fractions A
fl2 B17 0.083 0.598 chain rule A
B16 0.117 0.521 trig chain rule A
B18 0.233 0.332 definite integral A
B15 0.203 0.219 find and classify stationary points of cubic A

TODO

3. Fitting IRT model

The mirt implementation of the generalised partial credit model (gpcm) requires that the partial marks are consecutive integers. We therefore need to work around this by adjusting our scores into that form (e.g. replacing scores of 0, 2.5, 5 with 1, 2, 3), while keeping track of the true scores attached to each mark level so that we can properly compute expected scores later on.

# Determine the mark levels for each item
mark_levels <- item_scores %>% 
  pivot_longer(everything(), names_to = "item", values_to = "score") %>% 
  distinct() %>% 
  arrange(parse_number(item), score) %>% 
  group_by(item) %>%
  mutate(order = row_number()) %>% 
# Note that the convention used by mirt is for items that have only 2 levels (i.e. 0 marks or full marks),
# the columns are P.0 and P.1, while other items are indexed from 1, i.e. P.1, P.2, ...
# https://github.com/philchalmers/mirt/blob/accd2383b9a4d17a4cab269717ce98434900b62c/R/probtrace.R#L57
  mutate(level = case_when(
    max(order) == 2 ~ order - 1,
    TRUE ~ order * 1.0
  )) %>% 
  mutate(levelname = paste0(item, ".P.", level))

# Use the mark_levels table to replace scores with levels
# (first pivot the data to long form, make the replacement, then pivot back to wide again)
# Replace each raw score with its consecutive-integer mark level, using the
# mark_levels lookup: pivot to long form, join in the level, pivot back.
item_scores_levelled <- item_scores %>%
  # temporary row identifier so the wide shape can be restored afterwards
  mutate(row_id = row_number()) %>%
  pivot_longer(cols = -row_id, names_to = "item", values_to = "score") %>%
  left_join(mark_levels %>% select(item, score, level),
            by = c("item", "score")) %>%
  select(-score) %>%
  pivot_wider(names_from = "item", values_from = "level") %>%
  select(-row_id)

Show model fitting output

# Fit a unidimensional generalised partial credit model to the levelled scores
fit_gpcm <- mirt(
  data = item_scores_levelled, # just the columns with question scores
  model = 1,          # number of factors to extract
  itemtype = "gpcm",  # generalised partial credit model
  SE = TRUE           # estimate standard errors
  )
## 
Iteration: 1, Log-Lik: -83647.524, Max-Change: 3.33457
Iteration: 2, Log-Lik: -77201.184, Max-Change: 1.90505
Iteration: 3, Log-Lik: -75872.700, Max-Change: 1.22448
Iteration: 4, Log-Lik: -75437.373, Max-Change: 0.22514
Iteration: 5, Log-Lik: -75342.462, Max-Change: 0.30759
Iteration: 6, Log-Lik: -75313.426, Max-Change: 0.20640
Iteration: 7, Log-Lik: -75303.818, Max-Change: 0.10930
Iteration: 8, Log-Lik: -75299.897, Max-Change: 0.08989
Iteration: 9, Log-Lik: -75298.199, Max-Change: 0.05732
Iteration: 10, Log-Lik: -75297.494, Max-Change: 0.13424
Iteration: 11, Log-Lik: -75296.653, Max-Change: 0.00524
Iteration: 12, Log-Lik: -75296.486, Max-Change: 0.00120
Iteration: 13, Log-Lik: -75296.478, Max-Change: 0.00116
Iteration: 14, Log-Lik: -75296.449, Max-Change: 0.00153
Iteration: 15, Log-Lik: -75296.432, Max-Change: 0.00054
Iteration: 16, Log-Lik: -75296.430, Max-Change: 0.00048
Iteration: 17, Log-Lik: -75296.425, Max-Change: 0.00054
Iteration: 18, Log-Lik: -75296.421, Max-Change: 0.00031
Iteration: 19, Log-Lik: -75296.421, Max-Change: 0.00023
Iteration: 20, Log-Lik: -75296.420, Max-Change: 0.00045
Iteration: 21, Log-Lik: -75296.417, Max-Change: 0.00013
Iteration: 22, Log-Lik: -75296.417, Max-Change: 0.00011
Iteration: 23, Log-Lik: -75296.417, Max-Change: 0.00021
Iteration: 24, Log-Lik: -75296.415, Max-Change: 0.00021
Iteration: 25, Log-Lik: -75296.414, Max-Change: 0.00018
Iteration: 26, Log-Lik: -75296.414, Max-Change: 0.00034
Iteration: 27, Log-Lik: -75296.413, Max-Change: 0.00013
Iteration: 28, Log-Lik: -75296.413, Max-Change: 0.00011
Iteration: 29, Log-Lik: -75296.413, Max-Change: 0.00019
Iteration: 30, Log-Lik: -75296.412, Max-Change: 0.00009
## 
## Calculating information matrix...

Local independence

We compute Yen’s \(Q_3\) (1984) to check for any dependence between items after controlling for \(\theta\). This gives a score for each pair of items, with scores above 0.2 regarded as problematic (see DeMars, p. 48).

# Plot the Q3 residual correlations between item pairs.
# NOTE(review): `residuals` here must be a Q3 matrix computed in an earlier
# (unshown) chunk, e.g. residuals(fit_gpcm, type = "Q3"); the bare name would
# otherwise refer to the stats::residuals function -- confirm
residuals  %>% as.matrix() %>% 
  corrplot::corrplot(type = "upper")

This shows that most item pairs are independent, with only one pair showing cause for concern:

# List item pairs whose |Q3| exceeds the 0.2 threshold (DeMars, p. 48),
# keeping each unordered pair once (item1 number < item2 number)
residuals %>%
  rownames_to_column(var = "item1") %>%
  as_tibble() %>% 
  pivot_longer(cols = starts_with("B"), names_to = "item2", values_to = "Q3_score") %>% 
  filter(abs(Q3_score) > 0.2) %>% 
  filter(parse_number(item1) < parse_number(item2)) %>%
  gt()
item1 item2 Q3_score
B16 B17 0.2382038

Items B16 and B17 are on the chain rule (e.g. differentiating \(\cos(4x^2+5)\) and \((3x^2-8)^3\) respectively), so it is perhaps unsurprising that students’ performance on these items is not entirely independent.

Given that this violation of the local independence assumption is very mild, we proceed using this model.

Model parameters

We augment the data with estimated abilities for each student, using mirt’s fscores() function.

# Augment each student with an EAP ability estimate from the fitted model.
# NOTE(review): fscores() returns a matrix, so F1 is stored as a matrix
# column -- confirm downstream code expects this rather than a plain vector
test_scores_with_ability <- test_scores %>%
  mutate(F1 = fscores(fit_gpcm, method = "EAP"))

Next, we extract the IRT parameters.

coefs_gpcm <- coef(fit_gpcm, IRTpars = TRUE)

We use a modified version of the tidy_mirt_coefs function to get all the parameter estimates into a tidy table:

# Tidy one element of the coef(fit, IRTpars = TRUE) list into a one-row-per-
# parameter tibble with columns: par, est, and the CI bounds.
tidy_mirt_coefs <- function(x) {
  # flatten the coefficient element into long form, as characters not factors
  long_form <- x %>%
    melt() %>%
    as_tibble() %>%
    mutate(across(where(is.factor), as.character))

  long_form %>%
    # keep only the GPCM parameters: discrimination "a" and thresholds b1, b2, ...
    filter(X2 == "a" | str_detect(X2, "^b")) %>%
    # mirt labels the point-estimate row "par"; relabel it "est"
    mutate(X1 = if_else(X1 == "par", "est", X1)) %>%
    # one column per statistic (est, CI_2.5, CI_97.5)
    pivot_wider(names_from = X1, values_from = value) %>% 
    rename(par = X2)
}

# use head(., -1) to remove the last element, `GroupPars`, which does not correspond to a question
tidy_gpcm <- map_dfr(head(coefs_gpcm, -1), tidy_mirt_coefs, .id = "Question")

# One row-group per question: the discrimination (a_*) columns joined onto the
# difficulty/threshold (b*) rows, coloured and grouped with gt.
tidy_gpcm %>% 
  filter(par == "a") %>% 
  select(-par) %>% 
  rename_with(.fn = ~ paste0("a_", .x), .cols = -Question) %>% 
  left_join(
    tidy_gpcm %>% 
      filter(str_detect(par, "^b")),
    by = "Question"
  ) %>% 
  gt(groupname_col = "Question") %>%
  # FIX: contains() matches a literal string, so contains("est|_") selected no
  # columns and the estimates were left unrounded (7+ decimals in the rendered
  # output). matches() interprets "est|_" as the intended regular expression.
  fmt_number(columns = matches("est|_"), decimals = 3) %>%
  data_color(
    columns = contains("a_"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = c("est", "CI_2.5", "CI_97.5"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  ) %>%
  tab_spanner(label = "Discrimination", columns = contains("a_")) %>%
  tab_spanner(label = "Difficulty", columns = c("par", "est", "CI_2.5", "CI_97.5"))
Discrimination Difficulty
a_est a_CI_2.5 a_CI_97.5 par est CI_2.5 CI_97.5
B1
0.5538763 0.4903944 0.6173583 b1 0.44758709 1.095962e-01 0.7855780
0.5538763 0.4903944 0.6173583 b2 -5.47424095 -6.115404e+00 -4.8330782
B2
1.3682210 1.2565296 1.4799124 b1 -1.15235818 -1.234598e+00 -1.0701186
B3
0.3276120 0.3016218 0.3536022 b1 -1.41935162 -2.115406e+00 -0.7232974
0.3276120 0.3016218 0.3536022 b2 -0.28775553 -1.036459e+00 0.4609477
0.3276120 0.3016218 0.3536022 b3 -3.00757852 -3.708304e+00 -2.3068526
0.3276120 0.3016218 0.3536022 b4 -0.76878717 -1.332725e+00 -0.2048493
0.3276120 0.3016218 0.3536022 b5 -1.90880866 -2.424875e+00 -1.3927423
0.3276120 0.3016218 0.3536022 b6 -0.05309172 -5.130185e-01 0.4068350
0.3276120 0.3016218 0.3536022 b7 1.41093226 8.603929e-01 1.9614716
0.3276120 0.3016218 0.3536022 b8 -8.57743560 -9.398995e+00 -7.7558761
B4
1.1783937 1.0686988 1.2880886 b1 -1.67830802 -1.805471e+00 -1.5511448
B5
0.3850551 0.3497213 0.4203889 b1 0.16725222 -6.019929e-04 0.3351064
0.3850551 0.3497213 0.4203889 b2 10.98964874 9.631310e+00 12.3479873
0.3850551 0.3497213 0.4203889 b3 -8.91926582 -1.021830e+01 -7.6202330
B6
0.7698033 0.7074720 0.8321347 b1 1.39175594 1.121106e+00 1.6624063
0.7698033 0.7074720 0.8321347 b2 -3.83959915 -4.190396e+00 -3.4888025
B7
0.9879617 0.9038741 1.0720493 b1 0.44252659 3.705352e-01 0.5145180
B8
1.4363523 1.2968807 1.5758239 b1 -1.90990745 -2.042704e+00 -1.7771112
B9
0.7072840 0.6470002 0.7675678 b1 -1.21816238 -1.363316e+00 -1.0730090
0.7072840 0.6470002 0.7675678 b2 -1.75800905 -1.935106e+00 -1.5809118
B10
0.3782720 0.3522225 0.4043215 b1 5.00160599 4.399857e+00 5.6033549
0.3782720 0.3522225 0.4043215 b2 -5.30184180 -5.861207e+00 -4.7424766
0.3782720 0.3522225 0.4043215 b3 2.26760158 1.901003e+00 2.6342005
0.3782720 0.3522225 0.4043215 b4 -0.40266707 -7.764147e-01 -0.0289194
0.3782720 0.3522225 0.4043215 b5 0.97909178 5.841997e-01 1.3739839
0.3782720 0.3522225 0.4043215 b6 -0.53651685 -9.308384e-01 -0.1421953
0.3782720 0.3522225 0.4043215 b7 1.86494579 1.451997e+00 2.2778949
0.3782720 0.3522225 0.4043215 b8 -2.96095434 -3.410100e+00 -2.5118084
B11
0.3551384 0.3271714 0.3831053 b1 2.06361761 1.475744e+00 2.6514908
0.3551384 0.3271714 0.3831053 b2 -5.93186807 -6.554252e+00 -5.3094836
0.3551384 0.3271714 0.3831053 b3 9.95820327 8.564649e+00 11.3517579
0.3551384 0.3271714 0.3831053 b4 -4.02507289 -5.294966e+00 -2.7551795
0.3551384 0.3271714 0.3831053 b5 -9.94349022 -1.092011e+01 -8.9668666
B12
0.4824899 0.4381134 0.5268664 b1 2.50987607 1.972264e+00 3.0474886
0.4824899 0.4381134 0.5268664 b2 -1.94055862 -2.407669e+00 -1.4734478
0.4824899 0.4381134 0.5268664 b3 -5.97320582 -6.576823e+00 -5.3695888
B13
0.5201113 0.4777773 0.5624453 b1 -1.11114750 -1.320071e+00 -0.9022237
0.5201113 0.4777773 0.5624453 b2 3.92728685 3.355233e+00 4.4993403
0.5201113 0.4777773 0.5624453 b3 -7.34347899 -8.078829e+00 -6.6081289
B14
0.5240531 0.4876252 0.5604810 b1 1.45635488 1.132155e+00 1.7805550
0.5240531 0.4876252 0.5604810 b2 -4.16567166 -4.520855e+00 -3.8104883
0.5240531 0.4876252 0.5604810 b3 9.62758251 8.257883e+00 10.9972816
0.5240531 0.4876252 0.5604810 b4 -4.83893632 -6.111551e+00 -3.5663221
0.5240531 0.4876252 0.5604810 b5 -5.01890930 -5.530392e+00 -4.5074267
B15
0.1801936 0.1588291 0.2015582 b1 13.17000155 1.113448e+01 15.2055255
0.1801936 0.1588291 0.2015582 b2 -10.57602865 -1.229849e+01 -8.8535687
0.1801936 0.1588291 0.2015582 b3 6.23991570 5.051512e+00 7.4283189
0.1801936 0.1588291 0.2015582 b4 -16.04855068 -1.813916e+01 -13.9579415
B16
1.1973101 1.0605930 1.3340272 b1 -2.41235550 -2.621102e+00 -2.2036086
B17
1.5010099 1.3174025 1.6846173 b1 -2.51216426 -2.718915e+00 -2.3054131
B18
0.9398928 0.8476703 1.0321152 b1 -1.60100476 -1.742416e+00 -1.4595933
B19
1.5845071 1.4678601 1.7011542 b1 -0.53566496 -5.902717e-01 -0.4810583
B20
1.7581035 1.6310290 1.8851780 b1 0.08521633 3.822567e-02 0.1322070
# Save the tidied GPCM parameter estimates for use elsewhere
tidy_gpcm %>% 
  write_csv("output/uoe_post_gpcm-results.csv")

Information curves

# Ability grid over which to evaluate the information curves
theta <- seq(-6, 6, by=0.05)

# Per-item information at each theta (rows = theta values, columns = items)
info_matrix <- testinfo(fit_gpcm, theta, individual = TRUE)
# NOTE(review): assumes the rows of item_info are in the same order as the
# fitted items (B1..B20) -- confirm
colnames(info_matrix) <- item_info %>% pull(question)
# Long-form (theta, item, info_y) table with MATH group attached
item_info_data <- info_matrix %>% 
  as_tibble() %>% 
  bind_cols(theta = theta) %>% 
  pivot_longer(cols = -theta, names_to = "item", values_to = "info_y") %>% 
  left_join(item_info %>% select(item = question, MATH_group), by = "item") %>% 
  mutate(item = fct_reorder(item, parse_number(item)))

Test information curve

# Test information curve: sum the item information at each ability level
item_info_data %>% 
  group_by(theta) %>% 
  summarise(info_y = sum(info_y)) %>% 
  ggplot(aes(x = theta, y = info_y)) +
  geom_line() +
  labs(y = "Information") +
  theme_minimal()

# ggsave() saves the most recently displayed plot
ggsave("output/uoe_post_info.pdf", width = 6, height = 4, units = "in")

This shows that the information given by the test is skewed toward the lower end of the ability scale - i.e. it can give more accurate estimates of students’ ability where their ability level is slightly below the mean.

Item information curves

Breaking this down by question helps to highlight those questions that are most/least informative:

# Item information curves, one facet per item
item_info_data %>% 
  ggplot(aes(x = theta, y = info_y, colour = item)) +
  geom_line() +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  facet_wrap(vars(item)) +
  labs(y = "Information") +
  theme_minimal()

We can also compute the sums of different subsets of the information curves – here, we will look at the questions based on their MATH group:

# Total information curves for the whole test and for each MATH group.
item_info_data %>% 
  group_by(theta) %>% 
  summarise(
    items_all = sum(info_y),
    # subsetting by group membership is equivalent to sum(ifelse(cond, info_y, 0))
    items_A = sum(info_y[MATH_group == "A"]),
    items_B = sum(info_y[MATH_group == "B"]),
    items_C = sum(info_y[MATH_group == "C"])
  ) %>% 
  pivot_longer(cols = starts_with("items_"), names_to = "items", names_prefix = "items_", values_to = "info_y") %>% 
  mutate(items = fct_relevel(items, "all", "A", "B", "C")) %>% 
  ggplot(aes(x = theta, y = info_y, colour = items)) +
  geom_line() +
  scale_colour_manual(values = c("all" = "#000000", MATH_colours)) +
  labs(x = "Ability", y = "Information") +
  theme_minimal()

ggsave("output/uoe_post_info-curves_A-vs-B.pdf", units = "cm", width = 14, height = 6)

This shows that the information in the MATH Group B questions is at a higher point on the ability scale than for the MATH Group A questions.

Since the number of items in each case is different, we consider instead the mean information per item:

# Mean information per item, so groups of different sizes are comparable.
item_info_data %>% 
  group_by(theta) %>% 
  summarise(
    # mean(x[cond]) is equivalent to sum(ifelse(cond, x, 0)) / sum(cond)
    items_all = mean(info_y),
    items_A = mean(info_y[MATH_group == "A"]),
    items_B = mean(info_y[MATH_group == "B"]),
    items_C = mean(info_y[MATH_group == "C"])
  ) %>% 
  pivot_longer(cols = starts_with("items_"), names_to = "items", names_prefix = "items_", values_to = "info_y") %>% 
  mutate(items = fct_relevel(items, "all", "A", "B", "C")) %>% 
  ggplot(aes(x = theta, y = info_y, colour = items)) +
  geom_line() +
  scale_colour_manual(values = c("all" = "#000000", MATH_colours)) +
  labs(x = "Ability", y = "Mean information per item") +
  theme_minimal()

ggsave("output/uoe_post_info-curves_A-vs-B-avg.pdf", units = "cm", width = 10, height = 6)

This shows that items of each MATH group are giving broadly similar levels of information on average, but at different points on the ability scale.

Total information

Using mirt’s areainfo function, we can find the total area under the information curves.

# Area under the test information curve over -4 <= theta <= 4
info_gpcm <- areainfo(fit_gpcm, c(-4,4))
info_gpcm %>% gt()
LowerBound UpperBound Info TotalInfo Proportion nitems
-4 4 28.75089 30.87134 0.9313134 20

This shows that the total information across all items is 30.87 (to 2 d.p.).

Information by item

# Per-item total information: integrate each item's information curve
# over the full theta range via mirt::areainfo(), one item at a time.
tidy_info <- item_info %>%
  mutate(
    item_num = row_number(),
    TotalInfo = purrr::map_dbl(
      item_num,
      ~ areainfo(fit_gpcm, c(-4, 4), which.items = .x) %>% pull(TotalInfo)
    )
  )

# Items ranked by total information, with colour scales to aid scanning:
# green gradient on information columns, viridis on any outcome column.
tidy_info %>%
  select(-item_num) %>%
  arrange(desc(TotalInfo)) %>%
  gt() %>%
  fmt_number(columns = contains("a_"), decimals = 2) %>%
  fmt_number(columns = contains("b_"), decimals = 2) %>%
  data_color(
    columns = contains("info"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("outcome"),
    colors = scales::col_factor(palette = c("viridis"), domain = NULL)
  ) %>%
  cols_label(TotalInfo = "Information")
question description MATH_group Information
B10 2x2 system possibilities B 3.0213876
B14 find minimum gradient of cubic B 2.6186528
B3 completing the square A 2.5997337
B11 using max and min functions C 1.7682967
B20 product rule with given values B 1.7581034
B19 area between curve and x-axis (in 2 parts) B 1.5845065
B13 equation of tangent A 1.5551973
B6 graphical vector sum B 1.5394668
B17 chain rule A 1.5009902
B12 find point with given slope A 1.4460430
B8 simplify logs A 1.4363393
B9 identify graph of rational functions B 1.4129702
B2 composition of functions B 1.3682131
B16 trig chain rule A 1.1971739
B4 trig double angle formula A 1.1783275
B5 trig wave function A 1.1465660
B1 properties of fractions A 1.1056767
B7 angle between 3d vectors (in context) B 0.9878507
B18 definite integral A 0.9395251
B15 find and classify stationary points of cubic A 0.7063178

Restricting instead to the range \(-2\leq\theta\leq2\):

# Per-item information restricted to -2 <= theta <= 2.
tidy_info <- item_info %>%
  mutate(item_num = row_number()) %>% 
  # NOTE: here we deliberately pull Info (the area *within* the [-2, 2]
  # bounds), not TotalInfo (the area over the whole theta support) --
  # we want only the information available in this restricted range.
  # The column is still named TotalInfo so the table code below can be
  # reused unchanged.
  mutate(TotalInfo = purrr::map_dbl(
    item_num,
    ~ areainfo(fit_gpcm,
               c(-2, 2),
               which.items = .x) %>% pull(Info)
  ))

# Same ranked-information table as before, now over the restricted range:
# green gradient on information columns, viridis on any outcome column.
tidy_info %>%
  select(-item_num) %>%
  arrange(desc(TotalInfo)) %>%
  gt() %>%
  fmt_number(columns = contains("a_"), decimals = 2) %>%
  fmt_number(columns = contains("b_"), decimals = 2) %>%
  data_color(
    columns = contains("info"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("outcome"),
    colors = scales::col_factor(palette = c("viridis"), domain = NULL)
  ) %>%
  cols_label(TotalInfo = "Information")
question description MATH_group Information
B10 2x2 system possibilities B 2.5230197
B14 find minimum gradient of cubic B 2.1365093
B20 product rule with given values B 1.6556015
B3 completing the square A 1.5720938
B19 area between curve and x-axis (in 2 parts) B 1.4147516
B6 graphical vector sum B 1.1408163
B11 using max and min functions C 1.1241541
B2 composition of functions B 1.0235314
B13 equation of tangent A 1.0188476
B9 identify graph of rational functions B 0.8287289
B12 find point with given slope A 0.7970346
B8 simplify logs A 0.7593709
B7 angle between 3d vectors (in context) B 0.7321801
B4 trig double angle formula A 0.6843077
B5 trig wave function A 0.6350054
B18 definite integral A 0.5262348
B17 chain rule A 0.4737239
B16 trig chain rule A 0.4477539
B1 properties of fractions A 0.3976575
B15 find and classify stationary points of cubic A 0.3070112

Response curves

Since the GPCM is more complicated, there is a characteristic curve for each possible score on each question:

# Category response curves: probtrace() gives, for each item, the
# probability of each response category at every value in `theta`.
trace_data <- probtrace(fit_gpcm, theta) %>% 
  as_tibble() %>% 
  bind_cols(theta = theta) %>% 
  # One row per (theta, item-category) combination
  pivot_longer(cols = -theta, names_to = "level", values_to = "y") %>% 
  # Attach the item name and the mark awarded for each category level
  left_join(mark_levels %>% select(item, level = levelname, score), by = "level") %>% 
  mutate(score = as.factor(score))

# Category response curves, one facet per item, ordered by question number
trace_data %>% 
  mutate(item = fct_reorder(item, parse_number(item))) %>% 
  ggplot(aes(theta, y, colour = score)) +
  geom_line() +
  facet_wrap(~ item) +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  labs(y = "Probability of response") +
  theme_minimal()

To get a simplified picture for each question, we compute the expected score at each ability level:

# Expected score E[X | theta] per item: sum over categories of
# score * P(category | theta). The factor must go via as.character()
# before as.double() -- converting a factor directly to numeric would
# yield its internal level codes rather than the marks themselves.
expected_scores <- trace_data %>% 
  mutate(item = fct_reorder(item, parse_number(item))) %>% 
  group_by(item, theta) %>% 
  summarise(expected_score = sum(as.double(as.character(score)) * y), .groups = "drop")

# Item characteristic curves on the expected-score scale, one facet per item
expected_scores %>% 
  ggplot(aes(theta, expected_score, colour = item)) +
  geom_line() +
  facet_wrap(~ item) +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  labs(y = "Expected score") +
  theme_minimal()

The resulting curves look quite similar to those from the 2PL, allowing for some similar interpretation. For instance, superimposing all the curves shows that there is a spread of difficulties (i.e. thetas where the expected score is 2.5/5) and that some questions are more discriminating than others (i.e. steeper slopes):

# All expected-score curves superimposed on one panel. The `text = item`
# aesthetic is unused by ggplot2 itself but supplies ggplotly's hover tooltip.
plt <- expected_scores %>% 
  ggplot(aes(theta, expected_score, colour = item, text = item)) +
  geom_line() +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  labs(y = "Expected score") +
  theme_minimal()

# Interactive version for the HTML report; static PDF for publication
ggplotly(plt, tooltip = "text")
ggsave("output/uoe_post_iccs-superimposed.pdf", plot = plt, width = 20, height = 14, units = "cm")

Test response curve

# Test response curve data: total expected score across all items at each theta
total_expected_score <- expected_scores %>% 
  group_by(theta) %>% 
  summarise(expected_score = sum(expected_score))

# Plot the test response curve (expected total score against ability).
# Removed an unused scale_colour_viridis_d() call: no colour aesthetic
# is mapped in this plot, so the scale had no effect.
total_expected_score %>% 
  ggplot(aes(x = theta, y = expected_score)) +
  geom_line() +
  # Optional annotation of the expected score at theta = 0:
  # geom_point(data = total_expected_score %>% filter(theta == 0)) +
  # ggrepel::geom_label_repel(data = total_expected_score %>% filter(theta == 0), aes(label = round(expected_score, 1)), box.padding = 0.5) +
  labs(y = "Expected score") +
  theme_minimal()

Packages

In this analysis we used the following packages. You can learn more about each one by clicking on the links below.

  • mirt: For IRT analysis
  • psych: For factor analysis
  • tidyverse: For data wrangling and visualisation
  • reshape: For reshaping nested lists
  • vroom: For reading in many files at once
  • broom: For tidying model output
  • fs: For file system operations
  • gt: For formatting tables
  • knitr: For markdown tables
  • ggrepel: For labelling points without overlap
  • skimr: For data frame level summary
  • ggridges: For ridge plots